Data list

Client.csv
Disabilities.csv
EmploymentEducation.csv
Enrollment.csv
Exit.csv
HealthAndDv.csv
IncomeBenefits.csv
Services.csv

Load Data

setwd("C:/Users/Kyu/Google Drive/Hackathon/data")

Client <- read.csv("Client.csv", na.strings=c("", " ", "NULL", NA), stringsAsFactors=FALSE)
Disabilities <- read.csv("Disabilities.csv", na.strings=c("", " ", "NULL", NA))
EmploymentEducation <- read.csv("EmploymentEducation.csv", na.strings=c("", " ", "NULL", NA))
Enrollment <- read.csv("Enrollment.csv", na.strings=c("", " ", "NULL", NA))
Exit <- read.csv("Exit.csv", na.strings=c("", " ", "NULL", NA))
HealthAndDV <- read.csv("HealthAndDV.csv", na.strings=c("", " ", "NULL", NA))
IncomeBenefits <- read.csv("IncomeBenefits.csv", na.strings=c("", " ", "NULL", NA))
Services <- read.csv("Services.csv", na.strings=c("", " ", "NULL", NA))

Cleaning Missing or Unnecessary features

no_use_col <- c("DateCreated", "DateUpdated", "UserID", "ExportID", "ProjectEntryID")

# Since, these data sets are very imbalance, i will delete them
Client$First_Name <- NULL
Client$Middle_Name <- NULL
Client$Last_Name <- NULL
Client$SSN <- NULL
Client$OtherGender <- NULL
Client$Name_Data_Quality <- NULL
Client$SSNDataQuality <- NULL
Client$DOBDataQuality <- NULL
names(Client)[1] <- paste("PersonalID") # change UUID to PersonalID for joining dataset purpose
sapply(Client, function(x){sum(is.na(x))/length(x)})*100
##           PersonalID                  DOB        AmIndAKNative 
##             0.000000             2.990033             0.000000 
##                Asian                Black NativeHIOtherPacific 
##             0.000000             0.000000             0.000000 
##                White             RaceNone               Gender 
##             0.000000             0.000000             0.000000 
##        VeteranStatus   YearEnteredService        YearSeparated 
##             0.000000            35.880399            35.880399 
##           WorldWarII            KoreanWar           VietnamWar 
##             0.000000             0.000000             0.000000 
##          DesertStorm       AfghanistanOEF              IraqOIF 
##             0.000000             0.000000             0.000000 
##              IraqOND         OtherTheater       MilitaryBranch 
##             0.000000             0.000000            33.887043 
##     Discharge_Status 
##            33.887043
# remove ~50% missing values. 
cat_Disabilities <- sapply(Disabilities, function(x){sum(is.na(x))/length(x)})*100
Disabilities <- subset(Disabilities, select = cat_Disabilities < 5 )
Disabilities <- Disabilities[, !colnames(Disabilities) %in% no_use_col]
Disabilities$DisabilitiesID <- NULL
sapply(Disabilities, function(x){sum(is.na(x))/length(x)})*100
##          PersonalID     InformationDate      DisabilityType 
##                   0                   0                   0 
##  DisabilityResponse DataCollectionStage 
##                   0                   0
# remove ~50% missing values. 
cat_EmploymentEducation <- sapply(EmploymentEducation, function(x){sum(is.na(x))/length(x)})*100
EmploymentEducation <- subset(EmploymentEducation, select = cat_EmploymentEducation < 5 )
EmploymentEducation <- EmploymentEducation[, !colnames(EmploymentEducation) %in% no_use_col]
EmploymentEducation$EmploymentEducationID <- NULL
sapply(EmploymentEducation, function(x){sum(is.na(x))/length(x)})*100
##          PersonalID     InformationDate            Employed 
##                   0                   0                   0 
## DataCollectionStage 
##                   0
# remove ~50% missing values. 
cat_Enrollment <- sapply(Enrollment, function(x){sum(is.na(x))/length(x)})*100
Enrollment <- subset(Enrollment, select = cat_Enrollment < 5 )
Enrollment <- Enrollment[, !colnames(Enrollment) %in% no_use_col]
Enrollment$HouseholdID <- NULL
Enrollment$ProjectID <- NULL
sapply(Enrollment, function(x){sum(is.na(x))/length(x)})*100
##          PersonalID           EntryDate   RelationshipToHoH 
##            0.000000            0.000000            0.000000 
## LastPermanentStreet   LastPermanentCity  LastPermanentState 
##            4.532578            4.532578            4.532578 
##    LastPermanentZIP 
##            4.532578
# remove ~50% missing values. 
cat_Exit <- sapply(Exit, function(x){sum(is.na(x))/length(x)})*100
Exit <- subset(Exit, select = cat_Exit < 5 )
Exit <- Exit[, !colnames(Exit) %in% no_use_col]
Exit$ExitID <- NULL
sapply(Exit, function(x){sum(is.na(x))/length(x)})*100
##  PersonalID    ExitDate Destination 
##        0.00        0.00        1.25
# remove ~50% missing values. 
cat_HealthAndDV <- sapply(HealthAndDV, function(x){sum(is.na(x))/length(x)})*100
HealthAndDV <- subset(HealthAndDV, select = cat_HealthAndDV < 5 )
HealthAndDV <- HealthAndDV[, !colnames(HealthAndDV) %in% no_use_col]
HealthAndDV$HealthAndDVID <- NULL
HealthAndDV$DueDate <- NULL
sapply(HealthAndDV, function(x){sum(is.na(x))/length(x)})*100
##             PersonalID        InformationDate DomesticViolenceVictim 
##                      0                      0                      0 
##    DataCollectionStage 
##                      0
# remove ~50% missing values. 
cat_IncomeBenefits <- sapply(IncomeBenefits, function(x){sum(is.na(x))/length(x)})*100
IncomeBenefits <- subset(IncomeBenefits, select = cat_IncomeBenefits < 5 )
IncomeBenefits <- IncomeBenefits[, !colnames(IncomeBenefits) %in% no_use_col]
IncomeBenefits$IncomeBenefitsID <- NULL
sapply(IncomeBenefits, function(x){sum(is.na(x))/length(x)})*100
##              PersonalID         InformationDate                  Earned 
##                 0.00000                 0.00000                 0.00000 
##            Unemployment                     SSI                    SSDI 
##                 0.00000                 0.00000                 0.00000 
##     VADisabilityService  VADisabilityNonService       PrivateDisability 
##                 0.00000                 0.00000                 0.00000 
##             WorkersComp                    TANF                      GA 
##                 0.00000                 0.00000                 0.00000 
##        SocSecRetirement                 Pension            ChildSupport 
##                 0.00000                 0.00000                 0.00000 
##                 Alimony       OtherIncomeSource                    SNAP 
##                 0.00000                 0.00000                 0.00000 
##                     WIC           TANFChildCare      TANFTransportation 
##                 0.00000                 0.00000                 0.00000 
##               OtherTANF RentalAssistanceOngoing    RentalAssistanceTemp 
##                 0.00000                 0.00000                 0.00000 
##     OtherBenefitsSource  InsuranceFromAnySource                Medicaid 
##                 0.00000                 2.29682                 0.00000 
##                Medicare                   SCHIP       VAMedicalServices 
##                 0.00000                 0.00000                 0.00000 
##        EmployerProvided                   COBRA              PrivatePay 
##                 0.00000                 0.00000                 0.00000 
##          StateHealthIns     DataCollectionStage 
##                 0.00000                 0.00000
# remove ~50% missing values. 
cat_Services <- sapply(Services, function(x){sum(is.na(x))/length(x)})*100
Services <- subset(Services, select = cat_Services < 5 )
Services <- Services[, !colnames(Services) %in% no_use_col]
Services$ServicesID <- NULL
sapply(Services, function(x){sum(is.na(x))/length(x)})*100
##   PersonalID DateProvided   RecordType TypeProvided 
##            0            0            0            0

Feature Engeering

  1. Extracting important variables from the existing variables
  2. Impute missing data uising proper method.
  3. Dummyfy categorical data
  4. Remove zero variant variables
  5. Create new features

Client.csv

library(mlr)
library(lubridate) # for data
library(caret)

dim(Client)
## [1] 301  22
str(Client)
## 'data.frame':    301 obs. of  22 variables:
##  $ PersonalID          : int  90077 90703 90739 91161 91501 95126 95759 98250 99577 101863 ...
##  $ DOB                 : chr  "1/1/1953" "1/1/1953" "1/1/1971" "1/1/1952" ...
##  $ AmIndAKNative       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Asian               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Black               : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ NativeHIOtherPacific: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ White               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ RaceNone            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Gender              : int  1 1 1 1 1 0 0 1 1 1 ...
##  $ VeteranStatus       : int  1 1 0 1 1 1 1 1 1 1 ...
##  $ YearEnteredService  : int  1971 1971 NA 1970 1974 2000 1982 1984 1980 1977 ...
##  $ YearSeparated       : int  1973 1973 NA 1973 1977 2003 1985 1987 1982 1983 ...
##  $ WorldWarII          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ KoreanWar           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ VietnamWar          : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ DesertStorm         : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ AfghanistanOEF      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ IraqOIF             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ IraqOND             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ OtherTheater        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MilitaryBranch      : int  4 4 NA 1 2 2 1 1 4 1 ...
##  $ Discharge_Status    : int  2 1 NA 1 1 1 2 1 1 1 ...
summarizeColumns(Client)
##                    name      type  na         mean         disp median
## 1            PersonalID   integer   0 2.243461e+05 4.778426e+04 247742
## 2                   DOB character   9           NA           NA     NA
## 3         AmIndAKNative   integer   0 6.644518e-03 8.137794e-02      0
## 4                 Asian   integer   0 0.000000e+00 0.000000e+00      0
## 5                 Black   integer   0 6.013289e-01 4.904402e-01      1
## 6  NativeHIOtherPacific   integer   0 0.000000e+00 0.000000e+00      0
## 7                 White   integer   0 3.953488e-01 4.897397e-01      0
## 8              RaceNone   integer   0 0.000000e+00 0.000000e+00      0
## 9                Gender   integer   0 1.388704e+00 8.025692e+00      1
## 10        VeteranStatus   integer   0 1.675415e+01 3.632950e+01      1
## 11   YearEnteredService   integer 108 1.984699e+03 1.148064e+01   1982
## 12        YearSeparated   integer 108 1.988679e+03 1.275655e+01   1986
## 13           WorldWarII   integer   0 0.000000e+00 0.000000e+00      0
## 14            KoreanWar   integer   0 0.000000e+00 0.000000e+00      0
## 15           VietnamWar   integer   0 2.325581e-02 1.509659e-01      0
## 16          DesertStorm   integer   0 2.657807e-02 1.611146e-01      0
## 17       AfghanistanOEF   integer   0 2.657807e-02 1.611146e-01      0
## 18              IraqOIF   integer   0 3.654485e-02 1.879540e-01      0
## 19              IraqOND   integer   0 0.000000e+00 0.000000e+00      0
## 20         OtherTheater   integer   0 0.000000e+00 0.000000e+00      0
## 21       MilitaryBranch   integer 102 2.015075e+00 1.178414e+00      1
## 22     Discharge_Status   integer 102 1.547739e+00 1.350826e+00      1
##          mad   min    max nlevs
## 1  2314.3386 90077 252131     0
## 2         NA     1     15    69
## 3     0.0000     0      1     0
## 4     0.0000     0      0     0
## 5     0.0000     0      1     0
## 6     0.0000     0      0     0
## 7     0.0000     0      1     0
## 8     0.0000     0      0     0
## 9     0.0000     0     99     0
## 10    0.0000     0     99     0
## 11   10.3782  1954   2014     0
## 12   10.3782  1947   2015     0
## 13    0.0000     0      0     0
## 14    0.0000     0      0     0
## 15    0.0000     0      1     0
## 16    0.0000     0      1     0
## 17    0.0000     0      1     0
## 18    0.0000     0      1     0
## 19    0.0000     0      0     0
## 20    0.0000     0      0     0
## 21    0.0000     1      4     0
## 22    0.0000     1      7     0
# check if the ID is unique in the dataset
nrow(Client) == length(table(Client$PersonalID))
## [1] TRUE
# extracting the DBO year
Client$DOB <- as.POSIXct(strptime(Client$DOB, format="%m/%d/%Y"))
Client$DOByear <- year(Client$DOB)
Client$DOB <- NULL

# cleaning YearEnteredService and YearSeparated 
# impute missing values by mean and mode
Client$YearEnteredService[is.na(as.numeric(Client$YearEnteredService))] <- mean(Client$YearEnteredService, na.rm=T)
Client$YearSeparated[is.na(as.numeric(Client$YearSeparated))] <- mean(Client$YearSeparated, na.rm=T)

# change to factor variables
non_fector = c("PersonalID", "Name_Data_Quality", "SSNDataQuality", "DOBDataQuality", "YearEnteredService", "YearSeparated", "DOByear")
for(i in names(Client[,!colnames(Client) %in% non_fector])) { 
    Client[,names(Client) == i ] <- as.factor(Client[,names(Client) == i ])
}

# cleaning Gender var
# Since Gender has very imbalance dataset, I'm going to bin minority genders
levels(Client$Gender)[levels(Client$Gender) %in% 2] <- 1
levels(Client$Gender)[levels(Client$Gender) %in% 3] <- 1
levels(Client$Gender)[levels(Client$Gender) %in% 9] <- 1
levels(Client$Gender)[levels(Client$Gender) %in% 99] <- 1
table(Client$Gender, useNA = "always")
## 
##    0    1 <NA> 
##   91  210    0
# cleaning MilitaryBranch var
# create another level of factor for missing data
Client$MilitaryBranch <- as.numeric(Client$MilitaryBranch)
Client$MilitaryBranch[is.na(Client$MilitaryBranch)] <-  0
Client$MilitaryBranch <- as.factor(Client$MilitaryBranch)

# Bin minority Military Branch factors
levels(Client$MilitaryBranch)[levels(Client$MilitaryBranch) %in% 3] <- 2
levels(Client$MilitaryBranch)[levels(Client$MilitaryBranch) %in% 4] <- 2
table(Client$MilitaryBranch, useNA = "always")
## 
##    0    1    2 <NA> 
##  102  103   96    0
# cleaning Discharge_Status var
# create another level of factor for missing data
Client$Discharge_Status <- as.numeric(Client$Discharge_Status)
Client$Discharge_Status[is.na(Client$Discharge_Status)] <-  0
Client$Discharge_Status <- as.factor(Client$Discharge_Status)

# Bin minority Discharge_Status fators
levels(Client$Discharge_Status)[levels(Client$Discharge_Status) %in% 3] <- 2
levels(Client$Discharge_Status)[levels(Client$Discharge_Status) %in% 4] <- 2
levels(Client$Discharge_Status)[levels(Client$Discharge_Status) %in% 5] <- 2
table(Client$Discharge_Status, useNA = "always")
## 
##    0    1    2 <NA> 
##  102  150   49    0
# cleaning DOByear var
Client$DOByear[is.na(Client$DOByear)] <- median(Client$DOByear, na.rm=T)

sapply(Client, function(x){sum(is.na(x))/length(x)})*100
##           PersonalID        AmIndAKNative                Asian 
##                    0                    0                    0 
##                Black NativeHIOtherPacific                White 
##                    0                    0                    0 
##             RaceNone               Gender        VeteranStatus 
##                    0                    0                    0 
##   YearEnteredService        YearSeparated           WorldWarII 
##                    0                    0                    0 
##            KoreanWar           VietnamWar          DesertStorm 
##                    0                    0                    0 
##       AfghanistanOEF              IraqOIF              IraqOND 
##                    0                    0                    0 
##         OtherTheater       MilitaryBranch     Discharge_Status 
##                    0                    0                    0 
##              DOByear 
##                    0
Client$Black <- NULL
Client$White <- NULL

# convert them into dummy variables
dummies <- dummyVars("~ Gender + VeteranStatus + MilitaryBranch + Discharge_Status", data=Client, fullRank=T)
dummies.df <- as.data.frame(predict(dummies, Client))
Client <- cbind(Client, dummies.df)
Client$Gender <- NULL
Client$VeteranStatus <- NULL
Client$MilitaryBranch <- NULL
Client$Discharge_Status <- NULL

# change binary flag factor into numerical value again
for(i in names(Client[,!colnames(Client) %in% non_fector])) { 
    Client[,names(Client) == i ] <- as.numeric(Client[,names(Client) == i ])
}

# remove zero variance
nzCol <- nearZeroVar(Client, saveMetrics = TRUE)
Client <- Client[, nzCol$nzv == FALSE]

# cleaning age
Client$Age <- 2016 - Client$DOByear
Client[Client$Age < 18, "Age"] <- round(mean(Client$Age))

for (i in seq(length(Client$Age)))
    age <- paste(substr(Client$Age, start=0, stop=1))

Client$Age <- age
Client$YearEnteredService <- NULL
Client$YearSeparated <- NULL
Client$DOByear <- NULL

Disabilities.csv

dim(Disabilities)
## [1] 3318    5
str(Disabilities)
## 'data.frame':    3318 obs. of  5 variables:
##  $ PersonalID         : int  90077 90077 90077 90077 90077 90077 90077 90077 90077 90077 ...
##  $ InformationDate    : Factor w/ 166 levels "1/11/2016","1/12/2016",..: 38 10 53 129 38 10 53 129 38 10 ...
##  $ DisabilityType     : int  5 5 5 5 6 6 6 6 7 7 ...
##  $ DisabilityResponse : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DataCollectionStage: int  1 2 3 2 1 2 3 2 1 2 ...
summarizeColumns(Disabilities)
##                  name    type na         mean         disp   median
## 1          PersonalID integer  0 2.237205e+05 4.809816e+04 247333.0
## 2     InformationDate  factor  0           NA 9.764919e-01       NA
## 3      DisabilityType integer  0 7.500000e+00 1.708083e+00      7.5
## 4  DisabilityResponse integer  0 2.546715e-01 5.913156e-01      0.0
## 5 DataCollectionStage integer  0 1.735986e+00 8.731280e-01      1.0
##         mad   min    max nlevs
## 1 2453.7030 90077 252131     0
## 2        NA     6     78   166
## 3    2.2239     5     10     0
## 4    0.0000     0      3     0
## 5    0.0000     1      3     0
# Checking if the ID is unique in the dataset
nrow(Disabilities) == length(table(Disabilities$PersonalID))
## [1] FALSE
# extracting the year
Disabilities$InformationDate <- as.POSIXct(strptime(Disabilities$InformationDate, format="%m/%d/%Y"))
Disabilities$InformationDateMonth <- month(Disabilities$InformationDate)
Disabilities$InformationDate <- NULL

table(Disabilities$InformationDateYear, useNA = "always")
## 
## <NA> 
##    0

EmploymentEducation.csv

dim(EmploymentEducation)
## [1] 192   4
str(EmploymentEducation)
## 'data.frame':    192 obs. of  4 variables:
##  $ PersonalID         : int  117753 181183 117753 182784 90739 182784 106089 106089 240712 240713 ...
##  $ InformationDate    : Factor w/ 113 levels "1/13/2016","1/14/2016",..: 8 53 66 68 107 41 21 3 86 86 ...
##  $ Employed           : int  1 1 1 1 1 1 0 0 0 0 ...
##  $ DataCollectionStage: int  1 1 3 1 1 3 1 3 1 1 ...
summarizeColumns(EmploymentEducation)
##                  name    type na         mean         disp   median
## 1          PersonalID integer  0 2.241574e+05 4.706670e+04 246509.5
## 2     InformationDate  factor  0           NA 9.583333e-01       NA
## 3            Employed integer  0 6.583333e+00 3.013640e+00      8.0
## 4 DataCollectionStage integer  0 1.635417e+00 8.072939e-01      1.0
##        mad   min    max nlevs
## 1 2104.551 90077 249591     0
## 2       NA     1      8   113
## 3    0.000     0      8     0
## 4    0.000     1      3     0
# Checking if the ID is unique in the dataset
nrow(EmploymentEducation) == length(table(EmploymentEducation$PersonalID))
## [1] FALSE
# extracting the year
EmploymentEducation$InformationDate <- as.POSIXct(strptime(EmploymentEducation$InformationDate, format="%m/%d/%Y"))
EmploymentEducation$InformationDateMonth <- month(EmploymentEducation$InformationDate)
EmploymentEducation$InformationDate <- NULL

Enrollment.csv

dim(Enrollment)
## [1] 353   7
str(Enrollment)
## 'data.frame':    353 obs. of  7 variables:
##  $ PersonalID         : int  117753 117753 181183 182784 90739 106089 240712 240713 240713 240714 ...
##  $ EntryDate          : Factor w/ 130 levels "1/12/2016","1/13/2016",..: 11 11 72 89 124 21 105 105 105 105 ...
##  $ RelationshipToHoH  : int  1 1 1 1 1 1 1 2 2 2 ...
##  $ LastPermanentStreet: Factor w/ 225 levels "0","1 Jefferson Barracks Drive",..: 81 112 34 97 181 103 74 73 73 72 ...
##  $ LastPermanentCity  : Factor w/ 28 levels "BELLEVILLE","BISMARCK",..: 21 21 21 21 21 21 21 21 21 21 ...
##  $ LastPermanentState : Factor w/ 3 levels "IL","MO","ZZ": 2 2 2 2 2 2 2 2 2 2 ...
##  $ LastPermanentZIP   : int  63118 63118 63138 63118 63104 63118 63135 63135 63135 63135 ...
summarizeColumns(Enrollment)
##                  name    type na         mean         disp median
## 1          PersonalID integer  0 2.219889e+05 4.961181e+04 247742
## 2           EntryDate  factor  0           NA 9.688385e-01     NA
## 3   RelationshipToHoH integer  0 1.518414e+00 1.000185e+00      1
## 4 LastPermanentStreet  factor 16           NA           NA     NA
## 5   LastPermanentCity  factor 16           NA           NA     NA
## 6  LastPermanentState  factor 16           NA           NA     NA
## 7    LastPermanentZIP integer 16 6.069350e+04 1.217828e+04  63114
##         mad   min    max nlevs
## 1 2314.3386 90077 252131     0
## 2        NA     1     11   130
## 3    0.0000     1      5     0
## 4        NA     1     20   225
## 5        NA     1    242    28
## 6        NA    13    311     3
## 7   19.2738     0  65401     0
# Checking if the ID is unique in the dataset
nrow(Enrollment) == length(table(Enrollment$PersonalID))
## [1] FALSE
# extracting the year
Enrollment$EntryDate <- as.POSIXct(strptime(Enrollment$EntryDate, format="%m/%d/%Y"))
Enrollment$EntryDateMonth <- month(Enrollment$EntryDate)

# extracing geo information through the zipcode
library(zipcode)
data(zipcode)
Enrollment  <- merge(Enrollment, zipcode, by.x='LastPermanentZIP', by.y='zip')

Enrollment$LastPermanentStreet <- NULL
Enrollment$LastPermanentCity <- NULL
Enrollment$LastPermanentState <- NULL
Enrollment$city <- NULL
Enrollment$state <- NULL

# bin small dataset 
Enrollment[Enrollment$RelationshipToHoH == 4, 'RelationshipToHoH'] <- 3
Enrollment[Enrollment$RelationshipToHoH == 5, 'RelationshipToHoH'] <- 3
table(Enrollment$RelationshipToHoH)
## 
##   1   2   3 
## 246  40  38
Enrollment$RelationshipToHoH <- as.factor(Enrollment$RelationshipToHoH)
dummies <- dummyVars("~ RelationshipToHoH", data=Enrollment, fullRank=T)
dummies.df <- as.data.frame(predict(dummies, Enrollment))
Enrollment <- cbind(Enrollment, dummies.df)
Enrollment$RelationshipToHoH <- NULL

Exit.csv

dim(Exit)
## [1] 160   3
str(Exit)
## 'data.frame':    160 obs. of  3 variables:
##  $ PersonalID : int  117753 181183 182784 106089 240712 174821 240942 90077 243203 245519 ...
##  $ ExitDate   : Factor w/ 62 levels "1/11/2016","1/12/2016",..: 61 58 41 5 4 17 17 29 4 56 ...
##  $ Destination: int  19 NA 10 22 10 10 10 10 10 22 ...
summarizeColumns(Exit)
##          name    type na         mean         disp   median      mad   min
## 1  PersonalID integer  0 222930.31875 47841.456123 247332.5 2226.865 90077
## 2    ExitDate  factor  0           NA     0.937500       NA       NA     1
## 3 Destination integer  2     12.48734     5.186628     10.0    0.000     2
##      max nlevs
## 1 252015     0
## 2     10    62
## 3     23     0
# Checking if the ID is unique in the dataset
nrow(Exit) == length(table(Exit$PersonalID))
## [1] FALSE
# extracting the year
Exit$ExitDate <- as.POSIXct(strptime(Exit$ExitDate, format="%m/%d/%Y"))
Exit$ExitDateMonth <- month(Exit$ExitDate)

table(Exit$Destination, useNA = "always")
## 
##    2    3    7    9   10   11   12   13   16   19   20   22   23 <NA> 
##    4    7    2    1   88    3    2    2    1   37    2    8    1    2
Exit$Destination[is.na(Exit$Destination)] <-  2

# Bin some data and fit them in to 0 to 5
Exit$Destination <- as.numeric(Exit$Destination)
Exit[Exit$Destination == 7, 'Destination'] <- 1
Exit[Exit$Destination == 16, 'Destination'] <- 1
Exit[Exit$Destination == 9, 'Destination'] <- 1
Exit[Exit$Destination == 14, 'Destination'] <- 1
Exit[Exit$Destination == 2, 'Destination'] <- 1
Exit[Exit$Destination == 25, 'Destination'] <- 2
Exit[Exit$Destination == 3, 'Destination'] <- 2
Exit[Exit$Destination == 22, 'Destination'] <- 2
Exit[Exit$Destination == 23, 'Destination'] <- 3
Exit[Exit$Destination == 19, 'Destination'] <- 3
Exit[Exit$Destination == 20, 'Destination'] <- 4
Exit[Exit$Destination == 21, 'Destination'] <- 4
Exit[Exit$Destination == 12, 'Destination'] <- 4
Exit[Exit$Destination == 13, 'Destination'] <- 5
Exit[Exit$Destination == 10, 'Destination'] <- 5
Exit[Exit$Destination == 11, 'Destination'] <- 5

HealthAndDV.csv

dim(HealthAndDV)
## [1] 430   4
str(HealthAndDV)
## 'data.frame':    430 obs. of  4 variables:
##  $ PersonalID            : int  117753 181183 117753 182784 90739 182784 181183 106089 106089 240712 ...
##  $ InformationDate       : Factor w/ 164 levels "1/11/2016","1/12/2016",..: 16 93 113 115 156 71 101 30 5 135 ...
##  $ DomesticViolenceVictim: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DataCollectionStage   : int  1 1 3 1 1 3 3 1 3 1 ...
summarizeColumns(HealthAndDV)
##                     name    type na         mean         disp median
## 1             PersonalID integer  0 2.189243e+05 5.215057e+04 247069
## 2        InformationDate  factor  0           NA 9.767442e-01     NA
## 3 DomesticViolenceVictim integer  0 7.093023e-01 2.124704e+00      0
## 4    DataCollectionStage integer  0 1.586047e+00 7.996448e-01      1
##        mad   min    max nlevs
## 1 3048.967 90077 252131     0
## 2       NA     1     10   164
## 3    0.000     0      8     0
## 4    0.000     1      3     0
# Checking if the ID is unique in the dataset
nrow(HealthAndDV) == length(table(HealthAndDV$PersonalID))
## [1] FALSE
HealthAndDV$InformationDate <- NULL

IncomeBenefits.csv

dim(IncomeBenefits)
## [1] 566  35
str(IncomeBenefits)
## 'data.frame':    566 obs. of  35 variables:
##  $ PersonalID             : int  117753 181183 117753 182784 90739 182784 181183 106089 106089 240712 ...
##  $ InformationDate        : Factor w/ 170 levels "1/11/2016","1/12/2016",..: 19 99 119 121 162 76 107 33 5 141 ...
##  $ Earned                 : int  1 1 1 1 1 1 1 0 0 0 ...
##  $ Unemployment           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SSI                    : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ SSDI                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ VADisabilityService    : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ VADisabilityNonService : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PrivateDisability      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ WorkersComp            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ TANF                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GA                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SocSecRetirement       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Pension                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ChildSupport           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Alimony                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ OtherIncomeSource      : int  1 0 1 0 0 0 0 0 0 0 ...
##  $ SNAP                   : int  1 0 1 0 0 0 0 0 1 1 ...
##  $ WIC                    : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ TANFChildCare          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ TANFTransportation     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ OtherTANF              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ RentalAssistanceOngoing: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ RentalAssistanceTemp   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ OtherBenefitsSource    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ InsuranceFromAnySource : int  NA NA NA 0 0 1 0 0 NA 1 ...
##  $ Medicaid               : int  0 0 0 0 0 0 0 1 0 1 ...
##  $ Medicare               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SCHIP                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ VAMedicalServices      : int  1 0 1 0 0 0 0 1 0 1 ...
##  $ EmployerProvided       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ COBRA                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PrivatePay             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ StateHealthIns         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DataCollectionStage    : int  1 1 3 1 1 3 3 1 3 1 ...
summarizeColumns(IncomeBenefits)
##                       name    type na         mean         disp median
## 1               PersonalID integer  0 2.234312e+05 4.837040e+04 247333
## 2          InformationDate  factor  0           NA 9.770318e-01     NA
## 3                   Earned integer  0 2.508834e-01 4.339050e-01      0
## 4             Unemployment integer  0 1.766784e-02 1.318575e-01      0
## 5                      SSI integer  0 1.183746e-01 3.233369e-01      0
## 6                     SSDI integer  0 1.095406e-01 3.125926e-01      0
## 7      VADisabilityService integer  0 1.219081e-01 3.274691e-01      0
## 8   VADisabilityNonService integer  0 3.886926e-02 1.934543e-01      0
## 9        PrivateDisability integer  0 1.766784e-03 4.203314e-02      0
## 10             WorkersComp integer  0 0.000000e+00 0.000000e+00      0
## 11                    TANF integer  0 3.533569e-02 1.847902e-01      0
## 12                      GA integer  0 0.000000e+00 0.000000e+00      0
## 13        SocSecRetirement integer  0 1.060071e-02 1.025031e-01      0
## 14                 Pension integer  0 1.590106e-02 1.252035e-01      0
## 15            ChildSupport integer  0 1.060071e-02 1.025031e-01      0
## 16                 Alimony integer  0 0.000000e+00 0.000000e+00      0
## 17       OtherIncomeSource integer  0 2.473498e-02 1.554537e-01      0
## 18                    SNAP integer  0 3.250883e-01 4.688222e-01      0
## 19                     WIC integer  0 1.590106e-02 1.252035e-01      0
## 20           TANFChildCare integer  0 5.300353e-03 7.267456e-02      0
## 21      TANFTransportation integer  0 0.000000e+00 0.000000e+00      0
## 22               OtherTANF integer  0 3.533569e-03 5.939120e-02      0
## 23 RentalAssistanceOngoing integer  0 8.833922e-03 9.365565e-02      0
## 24    RentalAssistanceTemp integer  0 0.000000e+00 0.000000e+00      0
## 25     OtherBenefitsSource integer  0 1.060071e-02 1.025031e-01      0
## 26  InsuranceFromAnySource integer 13 3.218807e-01 4.676205e-01      0
## 27                Medicaid integer  0 6.713781e-02 2.504819e-01      0
## 28                Medicare integer  0 2.120141e-02 1.441827e-01      0
## 29                   SCHIP integer  0 5.123675e-02 2.206753e-01      0
## 30       VAMedicalServices integer  0 4.593640e-02 2.095324e-01      0
## 31        EmployerProvided integer  0 1.413428e-02 1.181489e-01      0
## 32                   COBRA integer  0 0.000000e+00 0.000000e+00      0
## 33              PrivatePay integer  0 1.766784e-03 4.203314e-02      0
## 34          StateHealthIns integer  0 1.060071e-02 1.025031e-01      0
## 35     DataCollectionStage integer  0 1.742049e+00 8.686672e-01      1
##         mad   min    max nlevs
## 1  2570.828 90077 252131     0
## 2        NA     1     13   170
## 3     0.000     0      1     0
## 4     0.000     0      1     0
## 5     0.000     0      1     0
## 6     0.000     0      1     0
## 7     0.000     0      1     0
## 8     0.000     0      1     0
## 9     0.000     0      1     0
## 10    0.000     0      0     0
## 11    0.000     0      1     0
## 12    0.000     0      0     0
## 13    0.000     0      1     0
## 14    0.000     0      1     0
## 15    0.000     0      1     0
## 16    0.000     0      0     0
## 17    0.000     0      1     0
## 18    0.000     0      1     0
## 19    0.000     0      1     0
## 20    0.000     0      1     0
## 21    0.000     0      0     0
## 22    0.000     0      1     0
## 23    0.000     0      1     0
## 24    0.000     0      0     0
## 25    0.000     0      1     0
## 26    0.000     0      1     0
## 27    0.000     0      1     0
## 28    0.000     0      1     0
## 29    0.000     0      1     0
## 30    0.000     0      1     0
## 31    0.000     0      1     0
## 32    0.000     0      0     0
## 33    0.000     0      1     0
## 34    0.000     0      1     0
## 35    0.000     1      3     0
# Checking if the ID is unique in the dataset
nrow(IncomeBenefits) == length(table(IncomeBenefits$PersonalID))
## [1] FALSE
# fil missing data with median
IncomeBenefits$InsuranceFromAnySource[is.na(IncomeBenefits$InsuranceFromAnySource)] <- 0 

# remove zero variance
nzCol <- nearZeroVar(IncomeBenefits, saveMetrics = TRUE)
IncomeBenefits <- IncomeBenefits[, nzCol$nzv == FALSE]

IncomeBenefits$InformationDate <- NULL

Services.csv

dim(Services)
## [1] 2684    4
str(Services)
## 'data.frame':    2684 obs. of  4 variables:
##  $ PersonalID  : int  247032 248537 248586 248537 248608 189603 189603 249591 189044 248812 ...
##  $ DateProvided: Factor w/ 115 levels "1/1/2016","1/10/2016",..: 44 46 28 29 27 32 38 40 40 41 ...
##  $ RecordType  : int  144 144 144 144 144 144 152 152 144 144 ...
##  $ TypeProvided: int  2 2 2 6 2 2 12 1 6 2 ...
summarizeColumns(Services)
##           name    type na         mean         disp median      mad   min
## 1   PersonalID integer  0 2.153891e+05 54211.398580 247332 2910.344 90077
## 2 DateProvided  factor  0           NA     0.976155     NA       NA     1
## 3   RecordType integer  0 1.455410e+02     3.155463    144    0.000   144
## 4 TypeProvided integer  0 2.934426e+00     2.370339      2    0.000     1
##      max nlevs
## 1 252131     0
## 2     64   115
## 3    152     0
## 4     14     0
# Checking if the ID is unique in the dataset
nrow(Services) == length(table(Services$PersonalID))
## [1] FALSE
# binding two service groups into a unique identification
Services[Services$RecordType == 144, "RecordType"] = "A"
Services[Services$RecordType == 152, "RecordType"] = "B"
Services$TypeProvided <- paste0(Services$RecordType, as.character(Services$TypeProvided))

# bind small data together
Services[Services$TypeProvided == "A3", "TypeProvided"] <- "A5"
Services[Services$TypeProvided == "B10", "TypeProvided"] <- "B4"
Services[Services$TypeProvided == "B11", "TypeProvided"] <- "B4"
Services[Services$TypeProvided == "B14", "TypeProvided"] <- "B4"
Services[Services$TypeProvided == "B3", "TypeProvided"] <- "B4"
Services[Services$TypeProvided == "B5", "TypeProvided"] <- "B4"
Services$RecordType <- NULL

Save Semi-Cleaned Data

write.csv(Client, file = "Client_Cleaned.csv",row.names=FALSE)
write.csv(Disabilities, file = "Disabilities_Cleaned.csv",row.names=FALSE)
write.csv(EmploymentEducation, file = "EmploymentEducation_Cleaned.csv",row.names=FALSE)
write.csv(Enrollment, file = "Enrollment_Cleaned.csv",row.names=FALSE)
write.csv(Exit, file = "Exit_Cleaned.csv",row.names=FALSE)
write.csv(HealthAndDV, file = "HealthAndDV_Cleaned.csv",row.names=FALSE)
write.csv(IncomeBenefits, file = "IncomeBenefits_Cleaned.csv",row.names=FALSE)
write.csv(Services, file = "Services_Cleaned.csv",row.names=FALSE)

setwd("C:/Users/Kyu/Google Drive/Hackathon/data")
Client <- read.csv("Client_Cleaned.csv", na.strings=c("", " ", "NULL", NA), stringsAsFactors=FALSE)
Disabilities <- read.csv("Disabilities_Cleaned.csv", na.strings=c("", " ", "NULL", NA))
EmploymentEducation <- read.csv("EmploymentEducation_Cleaned.csv", na.strings=c("", " ", "NULL", NA))
Enrollment <- read.csv("Enrollment_Cleaned.csv", na.strings=c("", " ", "NULL", NA))
Exit <- read.csv("Exit_Cleaned.csv", na.strings=c("", " ", "NULL", NA))
HealthAndDV <- read.csv("HealthAndDV_Cleaned.csv", na.strings=c("", " ", "NULL", NA))
IncomeBenefits <- read.csv("IncomeBenefits_Cleaned.csv", na.strings=c("", " ", "NULL", NA))
Services <- read.csv("Services_Cleaned.csv", na.strings=c("", " ", "NULL", NA))

Visualization

# Geo Map
library(maps)
library(ggmap)
library(dplyr)

# Load a map of STL into R:
STL <- get_map(location="Saint Louis", zoom=11)
LatLonCounts <- as.data.frame(table(round(Enrollment$longitude,2), round(Enrollment$latitude,2)))
LatLonCounts$Long <- as.numeric(as.character(LatLonCounts$Var1))
LatLonCounts$Lat <- as.numeric(as.character(LatLonCounts$Var2))

g1 <- ggmap(STL) +
    geom_point(data=LatLonCounts, aes(x=Long, y=Lat, color=Freq, size=Freq)) +
    scale_colour_gradient(low="yellow", high="red")
g1

g2 <- ggmap(STL) +
    stat_density2d(data=Enrollment, aes(x=longitude, y=latitude, fill=..level..), geom="polygon", alpha=0.2) +
    scale_fill_gradient(low="yellow", high="red")
g2

# top 10 zipcodes with most homeless people
top10_zip <- Enrollment %>% 
    group_by(LastPermanentZIP) %>% 
    count() %>% 
    top_n(10, n) %>% 
    arrange(-n)

Merging data sets

library(dplyr)

# remove all the time variables
str(Client)
## 'data.frame':    301 obs. of  9 variables:
##  $ PersonalID        : int  90077 90703 90739 91161 91501 95126 95759 98250 99577 101863 ...
##  $ Gender.1          : int  1 1 1 1 1 0 0 1 1 1 ...
##  $ VeteranStatus.1   : int  1 1 0 1 1 1 1 1 1 1 ...
##  $ VeteranStatus.99  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MilitaryBranch.1  : int  0 0 0 1 0 0 1 1 0 1 ...
##  $ MilitaryBranch.2  : int  1 1 0 0 1 1 0 0 1 0 ...
##  $ Discharge_Status.1: int  0 1 0 1 1 1 0 1 1 1 ...
##  $ Discharge_Status.2: int  1 0 0 0 0 0 1 0 0 0 ...
##  $ Age               : int  6 6 4 6 4 3 5 5 5 5 ...
str(Disabilities)
## 'data.frame':    3318 obs. of  5 variables:
##  $ PersonalID          : int  90077 90077 90077 90077 90077 90077 90077 90077 90077 90077 ...
##  $ DisabilityType      : int  5 5 5 5 6 6 6 6 7 7 ...
##  $ DisabilityResponse  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DataCollectionStage : int  1 2 3 2 1 2 3 2 1 2 ...
##  $ InformationDateMonth: int  10 1 11 6 10 1 11 6 10 1 ...
Disabilities$InformationDateMonth <- NULL

str(EmploymentEducation)
## 'data.frame':    192 obs. of  4 variables:
##  $ PersonalID          : int  117753 181183 117753 182784 90739 182784 106089 106089 240712 240713 ...
##  $ Employed            : int  1 1 1 1 1 1 0 0 0 0 ...
##  $ DataCollectionStage : int  1 1 3 1 1 3 1 3 1 1 ...
##  $ InformationDateMonth: int  1 2 4 5 8 12 10 1 7 7 ...
EmploymentEducation$DataCollectionStage <- NULL
EmploymentEducation$InformationDateMonth <- NULL

str(Enrollment)
## 'data.frame':    324 obs. of  8 variables:
##  $ LastPermanentZIP   : int  62040 62201 62202 62202 62202 62202 62202 62202 62202 62205 ...
##  $ PersonalID         : int  248176 249588 143804 249804 152213 174662 249797 249873 95126 249975 ...
##  $ EntryDate          : Factor w/ 130 levels "2012-01-05","2012-02-02",..: 57 114 120 118 120 120 118 120 118 108 ...
##  $ EntryDateMonth     : int  9 1 2 1 2 2 1 2 1 1 ...
##  $ latitude           : num  38.7 38.6 38.6 38.6 38.6 ...
##  $ longitude          : num  -90.1 -90.1 -90.2 -90.2 -90.2 ...
##  $ RelationshipToHoH.2: int  1 0 1 1 1 0 1 1 0 0 ...
##  $ RelationshipToHoH.3: int  0 0 0 0 0 0 0 0 0 0 ...
Enrollment$LastPermanentZIP <- NULL 
Enrollment$EntryDate <- NULL 
Enrollment$longitude <- NULL 
Enrollment$latitude <- NULL 

str(Exit)
## 'data.frame':    160 obs. of  4 variables:
##  $ PersonalID   : int  117753 181183 182784 106089 240712 174821 240942 90077 243203 245519 ...
##  $ ExitDate     : Factor w/ 62 levels "2012-04-05","2013-01-16",..: 1 62 30 2 44 3 3 18 44 60 ...
##  $ Destination  : int  3 1 5 2 5 5 5 5 5 2 ...
##  $ ExitDateMonth: int  4 2 12 1 1 1 1 11 1 2 ...
Exit$ExitDate <- NULL
Exit$ExitDateMonth <- NULL

str(HealthAndDV)
## 'data.frame':    430 obs. of  3 variables:
##  $ PersonalID            : int  117753 181183 117753 182784 90739 182784 181183 106089 106089 240712 ...
##  $ DomesticViolenceVictim: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DataCollectionStage   : int  1 1 3 1 1 3 3 1 3 1 ...
HealthAndDV$DataCollectionStage <- NULL

str(IncomeBenefits)
## 'data.frame':    566 obs. of  10 variables:
##  $ PersonalID            : int  117753 181183 117753 182784 90739 182784 181183 106089 106089 240712 ...
##  $ Earned                : int  1 1 1 1 1 1 1 0 0 0 ...
##  $ SSI                   : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ SSDI                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ VADisabilityService   : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ SNAP                  : int  1 0 1 0 0 0 0 0 1 1 ...
##  $ InsuranceFromAnySource: int  0 0 0 0 0 1 0 0 0 1 ...
##  $ Medicaid              : int  0 0 0 0 0 0 0 1 0 1 ...
##  $ SCHIP                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DataCollectionStage   : int  1 1 3 1 1 3 3 1 3 1 ...
IncomeBenefits$DataCollectionStage <- NULL

str(Services)
## 'data.frame':    2684 obs. of  3 variables:
##  $ PersonalID  : int  247032 248537 248586 248537 248608 189603 189603 249591 189044 248812 ...
##  $ DateProvided: Factor w/ 115 levels "1/1/2016","1/10/2016",..: 44 46 28 29 27 32 38 40 40 41 ...
##  $ TypeProvided: Factor w/ 9 levels "A2","A4","A5",..: 1 1 1 4 1 1 6 5 4 1 ...
Services$DateProvided <- NULL

# Save as ML dataset
write.csv(Client, file = "Client_Cleaned_ML.csv",row.names=FALSE)
write.csv(Disabilities, file = "Disabilities_Cleaned_ML.csv",row.names=FALSE)
write.csv(EmploymentEducation, file = "EmploymentEducation_Cleaned_ML.csv",row.names=FALSE)
write.csv(Enrollment, file = "Enrollment_Cleaned_ML.csv",row.names=FALSE)
write.csv(Exit, file = "Exit_Cleaned_ML.csv",row.names=FALSE)
write.csv(HealthAndDV, file = "HealthAndDV_Cleaned_ML.csv",row.names=FALSE)
write.csv(IncomeBenefits, file = "IncomeBenefits_Cleaned_ML.csv",row.names=FALSE)
write.csv(Services, file = "Services_Cleaned_ML.csv",row.names=FALSE)

dim(Disabilities)
## [1] 3318    4
length(unique(Disabilities[ ,"PersonalID"]))
## [1] 301
dim(Services)
## [1] 2684    2
length(unique(Services[ ,"PersonalID"]))
## [1] 201
ml_data <- right_join(Disabilities, Services, by = "PersonalID")

dim(IncomeBenefits)
## [1] 566   9
length(unique(IncomeBenefits[ ,"PersonalID"]))
## [1] 301
ml_data <- right_join(ml_data, IncomeBenefits, by = "PersonalID")

dim(HealthAndDV)
## [1] 430   2
length(unique(HealthAndDV[ ,"PersonalID"]))
## [1] 259
ml_data <- right_join(ml_data, HealthAndDV, by = "PersonalID")

dim(Enrollment)
## [1] 324   4
length(unique(Enrollment[ ,"PersonalID"]))
## [1] 275
ml_data <- right_join(ml_data, Enrollment, by = "PersonalID")

dim(Client)
## [1] 301   9
length(unique(Client[ ,"PersonalID"]))
## [1] 301
ml_data <- right_join(ml_data, Client, by = "PersonalID")

dim(EmploymentEducation)
## [1] 192   2
length(unique(EmploymentEducation[ ,"PersonalID"]))
## [1] 110
ml_data <- right_join(ml_data, EmploymentEducation, by = "PersonalID")

dim(Exit)
## [1] 160   2
length(unique(Exit[ ,"PersonalID"]))
## [1] 158
ml_data <- right_join(ml_data, Exit, by = "PersonalID")

Imputing missing data after ‘join’

library(caTools)
# ml_data$Destination <- as.factor(ml_data$Destination)

# drop na values
ml_data <- ml_data[complete.cases(ml_data),]
round(sapply(ml_data, function(x){sum(is.na(x))/length(x)})*100, 3)
##             PersonalID         DisabilityType     DisabilityResponse 
##                      0                      0                      0 
##    DataCollectionStage           TypeProvided                 Earned 
##                      0                      0                      0 
##                    SSI                   SSDI    VADisabilityService 
##                      0                      0                      0 
##                   SNAP InsuranceFromAnySource               Medicaid 
##                      0                      0                      0 
##                  SCHIP DomesticViolenceVictim         EntryDateMonth 
##                      0                      0                      0 
##    RelationshipToHoH.2    RelationshipToHoH.3               Gender.1 
##                      0                      0                      0 
##        VeteranStatus.1       VeteranStatus.99       MilitaryBranch.1 
##                      0                      0                      0 
##       MilitaryBranch.2     Discharge_Status.1     Discharge_Status.2 
##                      0                      0                      0 
##                    Age               Employed            Destination 
##                      0                      0                      0
ml_data$PersonalID <- NULL
ml_data$TypeProvided <- as.factor(ml_data$TypeProvided)

# save combined data
write.csv(ml_data, file = "ml_data.csv",row.names=FALSE)

# Spliting data
subsetD <- ml_data
# subsetD <- sample(ml_data)[0:5000,]
set.seed(2000)
split <- sample.split(subsetD$Destination, SplitRatio=0.7)
train <- subset(subsetD, split==TRUE)
test <- subset(subsetD, split==FALSE)

Feature Selection

# load library for machine learning
library(mlr)
library(FSelector)

# create task
train.task <- makeClassifTask(data=train, target="Destination")
test.task <- makeClassifTask(data=test, target="Destination")

# remove zero variance features
train.task <- removeConstantFeatures(train.task)
## Removing 3 columns: RelationshipToHoH.2,RelationshipToHoH.3,VeteranStatus.99
test.task <- removeConstantFeatures(test.task)
## Removing 3 columns: RelationshipToHoH.2,RelationshipToHoH.3,VeteranStatus.99
# get variable importance chart
var_imp <- generateFilterValuesData(train.task, method=c("information.gain"))
plotFilterValues(var_imp, feat.type.cols=TRUE)

# select only important variables
imp_feat <- (var_imp$data %>% arrange(-information.gain) %>% top_n(7))$name
## Selecting by information.gain
imp_feat <- c(imp_feat, "Destination") # add target variable

# create task
train <- train[ ,colnames(train) %in% imp_feat]
test <- test[ ,colnames(test) %in% imp_feat]
                
trainTask <- makeClassifTask(data=train, target="Destination")
testTask <- makeClassifTask(data=test, target="Destination")

Processing Hyper-parameter Tunning with Cross-Validation

# getParamSet("classif.randomForest")

# create a learner
# rf <- makeLearner("classif.randomForest", predict.type="response", par.vals=list(ntree=200, mtry=3))
# rf$par.vals <- list(importance=TRUE)
# 
# # set tunable parameters
# # grid search to find hyperparameters
# rf_param <- makeParamSet(makeIntegerParam("ntree", lower=5, upper=10), 
#                          makeIntegerParam("mtry", lower=5, upper=10), 
#                          makeIntegerParam("nodesize", lower=25, upper=50))
# 
# #  random search for 50 iterations
# rancontrol <- makeTuneControlRandom(maxit=50L)
# 
# # set 3 fold cross validation
# set_cv <- makeResampleDesc("CV", iters=3L)
# 
# # hypertuning
# rf_tune <- tuneParams(learner=rf, resampling=set_cv, task=train.task, par.set=rf_param, control=rancontrol, measures=acc)

Building Model

library(randomForest)
#best parameters
# rf_tune$x
# $ntree
# [1] 9
# 
# $mtry
# [1] 5
# 
# $nodesize
# [1] 40

# cv accuracy
# rf_tune$y
# acc.test.mean 
#     0.8829761 

# using hyperparameters for modeling
# rf.tree <- setHyperPars(rf, par.vals=rf_tune$x)

# train a model
# rforest <- train(rf.tree, trainTask)
# getLearnerModel(t.rpart)

# make predictions
# rfmodel <- predict(rforest, test.task)


mdl.rf <- randomForest(as.factor(Destination) ~ .,
                       data = train,
                       replace = F, ntree = 9,
                       do.trace = F, mtry = 5, nodesize = 40)

pred.rf.test <- predict(mdl.rf, test)
conf.mtx <- confusionMatrix(pred.rf.test, test$Destination)
conf.mtx
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction      1      2      3      4      5
##          1   8864      0   8994      0      0
##          2      0    294      0      0      0
##          3   9957   2449  19056      0    153
##          4      0      0      0   3991      0
##          5      0      0     73      0 107008
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8655          
##                  95% CI : (0.8639, 0.8672)
##     No Information Rate : 0.6663          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7355          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.47096 0.107182   0.6776  1.00000   0.9986
## Specificity           0.93667 1.000000   0.9054  1.00000   0.9986
## Pos Pred Value        0.49636 1.000000   0.6028  1.00000   0.9993
## Neg Pred Value        0.93036 0.984746   0.9298  1.00000   0.9972
## Prevalence            0.11702 0.017054   0.1749  0.02481   0.6663
## Detection Rate        0.05511 0.001828   0.1185  0.02481   0.6653
## Detection Prevalence  0.11103 0.001828   0.1966  0.02481   0.6658
## Balanced Accuracy     0.70382 0.553591   0.7915  1.00000   0.9986

Save Models

save(mdl.rf, file = "mdl.rf.RData")
save(conf.mtx, file = "conf.mtx.RData")
save(test, file = "test.RData")
save(top10_zip, file = "top10_zip.RData")
save(g1, file = "g1.RData")
save(g2, file = "g2.RData")
save(var_imp, file = "var_imp.RData")

Sql Query

# library("RMySQL")
# 
# # GET THAT DATABASE CONNECTION 
# mydb <- dbConnect(RMySQL::MySQL(), user='GHack', password='GlobalHack123!', 
#                dbname='globalhack', host='Globalhack.il1.rdbs.ctl.io', port=49424)
# 
# # FETCH NAMES & ADDRESSES!  
# result <- dbSendQuery(mydb, "select * 
#                 from CleanClient c
#                 LEFT JOIN (select * from CleanDisabilities group by CleanDisabilities.PersonalID) d on c.PersonalID = d.PersonalID
#                 LEFT JOIN (select * from CleanEmploymentEducation group by PersonalID) e on c.PersonalID = e.PersonalID
#                 LEFT JOIN (select * from CleanExitData group by PersonalID) x on c.PersonalID = x.PersonalID
#                 LEFT JOIN (select * from CleanHealthAndDV group by PersonalID) h on c.PersonalID = h.PersonalID
#                 LEFT JOIN (select * from CleanIncomeBenefits group by PersonalID) i on c.PersonalID = i.PersonalID
#                 LEFT JOIN (select * from CleanServices group by PersonalID) s on c.PersonalID = s.PersonalID
#                 LEFT JOIN (select * from CleanEnrollment group by PersonalID) r on c.PersonalID = r.PersonalID")
# names <- fetch(result, n=-1) 
# dbClearResult(result)